# The replication file for the main results of the Elections paper

library(countrycode)
library(lubridate)
library(ggplot2)
library(reshape)
library(caret)
library(lme4)
library(RColorBrewer)
library(hydroGOF)

setwd("~/Downloads/Science Replication and Data - Polling/")
dat <- read.csv("global_polling_replication_data.csv")

###########################
# Essential Feature Creation
#######################
dat$wts <- scale(1/(dat$days.to.elec+2), center=F) 
dat$pro.incumbent <- scale(dat$pro.incumbent, center=F)
dat$logpolls <- log(abs(dat$pollmarg)+.00000001)
dat$sm.pollmarg <- NA
dat$Round.Date <- as.Date(as.character(dat$Round.Date), format="%m/%d/%Y")

# get elections that are trained and tested
elections.training <- read.csv("elections_training_wround09082016.csv")

########################
#Some basic descriptives
####################

dat <- dat[!is.na(dat$pollmarg), ]
dat <- dat[!is.na(dat$Poll.Date), ]

#Aggregating
sm.agg <- aggregate(cbind(pollmarg, realmarg, Round.Date, gdp.growth, year, region=as.numeric(as.factor(region)), incRun, l1polity2, wb.inflation, asp.gdp)~electionid, median, data=dat)
sm.agg = sm.agg[order(sm.agg$Round.Date), ]

#Summary of proportions in which incumbents and non-incumbents run and win
table(sm.agg$incRun, sm.agg$realmarg>0)/sum(!is.na(sm.agg$pollmarg))

#Rate of re-election for incumbents - 72.4%
total.inc <- sum(table(sm.agg$incRun, sm.agg$realmarg>0)[2,])
table(sm.agg$incRun, sm.agg$realmarg>0)[2,]/total.inc

#Rate of election for non-incumbents (chosen successors) - 55.68%
total.noninc <- sum(table(sm.agg$incRun, sm.agg$realmarg>0)[1,])
table(sm.agg$incRun, sm.agg$realmarg>0)[1,]/total.noninc

#Naive model of just picking the incumbent candidate - 62.3% 
table(sm.agg$realmarg>0)/length(sm.agg$realmarg)

#Rate of election if polls say so
table(sm.agg$pollmarg*sm.agg$realmarg>=0)/sum(!is.na(sm.agg$pollmarg)) #overall - 86.3%
table(sm.agg$pollmarg[sm.agg$incRun==1]*sm.agg$realmarg[sm.agg$incRun==1]>=0)/sum(sm.agg$incRun==1) #incumbent officeholder running
table(sm.agg$pollmarg[sm.agg$incRun==0]*sm.agg$realmarg[sm.agg$incRun==0]>=0)/sum(sm.agg$incRun==0) #non-incumbent officeholder running

# RMSE OF NAIVE Poll-Only model
table(sm.agg$pollmarg[51:146]*sm.agg$realmarg[51:146]>0)/sum(!is.na(sm.agg$pollmarg[51:146])) #out of sample accuracy
postResample(sm.agg$pollmarg, sm.agg$realmarg)
postResample(sm.agg$pollmarg[51:146], sm.agg$realmarg[51:146])

### ANALYSIS SECTION

# Smoothing Model
trainingsize <- 50
dat$sm.pollmarg <- NA

# Step: creating the smoothed estimate for the training set
# Step: create loop creating smoothed estimate for each election in the testing data
get_preds <- function(pdat=dat, trainingset=elections.training, trainingsize= 50, leadTime = 0){
  pdat$wts <- scale(1/(pdat$days.to.elec-leadTime+2), center=F) #
  
  # Remove missing polls and dates
  pdat <- pdat[!is.na(pdat$pollmarg), ]
  pdat <- pdat[!is.na(pdat$Poll.Date), ]
  
  for(i in 1:sum(elections.training$trainingset==0)){
    
    elecs <- pdat$electionid %in% elections.training$electionid[ 1:(trainingsize+i)] #get T/F in or out of training set
    
    #########
    sm.mod <- try(lmer(pollmarg ~ pro.incumbent + (pro.incumbent|incRun) + (1|asp.gdp.bin) +(1|electionid)+(1|ccode)+(1|Pollster)+(1|region), na.action=na.exclude, weights=wts, data=pdat[ elecs, ], control=lmerControl(optimizer="bobyqa", boundary.tol = 1e-2, check.scaleX="silent.rescale")) )
    #error handling
    w <- warnings(); assign("last.warning", NULL, envir = baseenv()) #assign warning to w, then delete all warnings
    
    #######
    if(is.null(w)==F){ #if there is a warning, then print as such
      print(paste("warning", i, "did not converge"))
      w <- NULL
      assign("last.warning", NULL, envir = baseenv())
    }
    #######
    if(i==1){ #if it is the first round of the loop, predict the training set plus the first test election
      pdat$sm.pollmarg[ elecs] <- predict(sm.mod, newdata=pdat[ elecs,])    
    }
    #######
    if(i>1){ #if it is not the first round of the loop, predict just the next test election
      next.elec <- pdat$electionid %in% elections.training$electionid[ trainingsize + i ] #get just the election to make smoothed pred.
      pdat$sm.pollmarg[ next.elec ] <- predict(sm.mod, newdata=pdat[next.elec, ]) #produces a prediction
      print(elections.training$electionid[ trainingsize + i ])
    }
    print(paste(i, "successfully completed"))
  }
  return(pdat)
}

pdat1 <- get_preds(pdat=dat, trainingset=elections.training, trainingsize=50, leadTime = 0)

# Aggregating the smoothed polling margin and all the rest
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
sum(sm.agg$sm.pollmarg*sm.agg$realmarg > 0, na.rm=T) / sum(!is.na(sm.agg$sm.pollmarg))

##############################
# Outcome model without polling data (Structural Model)
################
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
#linear partial least squares with structural features only
sm.agg <- sm.agg[order(sm.agg$Round.Date), ]
tr.stru <- train(realmarg~l1polity2+wb.inflation+incRun+asp.gdp, data=sm.agg, trControl=trainControl(method="none"), metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
sum( predict(tr.stru, newdata=sm.agg[1:50,])*sm.agg$realmarg[1:50] > 0 )/nrow(sm.agg[1:50,]) # the in-sample acc. rate
postResample( predict(tr.stru, newdata=sm.agg[1:50,]), sm.agg$realmarg[1:50]) # the in-sample error

#out of sample error for Structural Model
tr.stru <- train(realmarg~l1polity2+wb.inflation+incRun+asp.gdp, data=sm.agg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
sum( (tr.stru$pred$pred* tr.stru$pred$obs) >= 0) / length(tr.stru$pred$pred) #the out-of-sample rate
# RMSE
postResample( tr.stru$pred$pred, tr.stru$pred$obs ) # the out-of-sample RMSE and R-squared

#################
# MODEL of outcome using smoothed polling data
################
library(caret); set.seed(13243)

run.finalsynthetic <- function(data=sm.agg, meth="pls", grid=expand.grid(ncomp=2)){
  sm.agg <- sm.agg[order(sm.agg$Round.Date), ]
  trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
  #linear partial least squares
  trmode <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation, data=sm.agg, trControl=trcontrol, metric="RMSE", method=meth, tuneGrid=grid) 
  return(trmode)
}

#Getting the final results and plotting the relationship:

#Overall model outcome
sm.agg <- sm.agg[order(sm.agg$Round.Date), ]
trmode <- run.finalsynthetic(data=sm.agg, meth="pls", grid=expand.grid(ncomp=2))
#In-sample Results
trmod.insample <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation, data=sm.agg[1:50,], trControl= trainControl(method="none", savePredictions = T), metric="RMSE", method="pls", tuneGrid= expand.grid(ncomp=2)) 
sum( predict(trmod.insample, newdata=sm.agg[1:50, ])*sm.agg$realmarg[1:50] > 0 )/50 # the in-sample rate
# Note, compared to polls alone in training sample, the polls are only 80% accurate
sum( sm.agg$pollmarg[1:50]*sm.agg$realmarg[1:50] >= 0 )/50
postResample( predict(trmod.insample, newdata=sm.agg[1:50, ]), sm.agg$realmarg[1:50]) # the in-sample rate
#Out-of-sample Results
sum( (trmode$pred$pred* trmode$pred$obs) >= 0) / length(trmode$pred$pred) #the out-of-sample rate
postResample(trmode$pred$pred, trmode$pred$obs)  #the out-of-sample RMSE

#Plots of smoothing, outcome, and map
trmode$pred$result <- ifelse((trmode$pred$pred* trmode$pred$obs) > 0, "correct", "incorrect")
p1 = qplot(pred, obs, data=trmode$pred, colour=result) + geom_smooth(method="loess", span=1) + ylab("Observed Margin of Incumbent") + xlab("Model Prediction - Out of Sample")
p2 = ggplot(pdat1, aes(x=sm.pollmarg, y=realmarg, colour=electionid)) + geom_point( alpha=.4) + theme(legend.position="none") + ylab("Real Margin of Incumbent") + xlab("Smoothed Polling Margin")
#multiplot(p2, p1, cols=2) 
#Figure 2 graphs:
p1
p2

#MAP OF COUNTRY Result out of SAMPLE, the map package will overplot if there are duplicates, so *correctly* predicted duplicate countries are removed
library(rworldmap)
sm.agg$ISO3 <- countrycode(as.numeric(substr(sm.agg$electionid, 1, 3)), origin="cown", destination="iso3c")

outsample <- sm.agg[order(sm.agg$Round.Date), ]
outsample <- outsample[51:146, ]
outsample$pred <- 1*(trmode$pred$pred*trmode$pred$obs > 0)
dfo.lose <- data.frame(ISO3 = unique(outsample$ISO3[outsample$pred==0]), pred=0)
dfo.win <- data.frame(ISO3 = unique(outsample$ISO3[outsample$pred==1]), pred=1)
dfo.win <- dfo.win[!dfo.win$ISO3 %in% dfo.lose$ISO3, ]
dfo <- rbind(dfo.win, dfo.lose)
dfo$out <- ifelse(dfo$pred==1, "Correct", "Incorrect")
sPDF <- joinCountryData2Map(dfo, joinCode = "ISO3", nameJoinColumn = "ISO3")
par(mar=c(0.5,0.5,0.5,0.5))
# Figure 2 map:
mapCountryData(sPDF, nameColumnToPlot = "out", mapTitle = "", colourPalette = brewer.pal(3, "RdBu")[c(1,3)], catMethod="categorical", addLegend = F, oceanCol="white", missingCountryCol = "dark grey", borderCol = NA)

# Which countries were correctly and incorrectly predicted?

paste(outsample$ISO3[outsample$pred==1], outsample$year[outsample$pred==1], sep="-")
paste(outsample$ISO3[outsample$pred==0], outsample$year[outsample$pred==0], sep="-") 

# The random effect plots for the smoothing model
sm.mod <- lmer(pollmarg ~ pro.incumbent + (pro.incumbent|incRun) + (1|asp.gdp.bin) + (1|electionid)+(1|ccode)+(1|Pollster)+(1|region), na.action=na.exclude, weights=wts, data=dat, control=lmerControl(optimizer="bobyqa", boundary.tol = 1e-2, check.scaleX="silent.rescale")) 

# parse the variance of the model here:
as.data.frame(VarCorr(sm.mod))

plotfunk <- function(model=sm.mod, variable.name="none", ordering=T){
  randoms<-ranef(model, condVar = TRUE)
  qq <- attr(ranef(model, condVar = TRUE)[[variable.name]], "postVar")
  rand.interc<-randoms[[variable.name]]
  df<-data.frame(Intercepts=randoms[[variable.name]] [,1],
                 sd.interc=2*sqrt(qq[,,1:length(qq)]),
                 lev.names=rownames(rand.interc))
  df <- df[order(df$Intercepts), ]
  if(ordering){
    df <- df[order(df$lev.names), ]
  }
  df$lev.names<-factor(df$lev.names,levels=unique(df$lev.names) )
  return(df) 
}

# PLOTTING ASP.GDP AGAINST MARGIN with the Binned Data: (Second part of Fig S15)
levs <- strsplit(gsub("\\(|\\]", "", as.character(dat$asp.gdp.bin)), ",")
levs <- sapply(levs, function(x) mean(as.numeric(x)))
flevs <- factor(levs, labels=1:length(unique(levs)[-20]))
df <- na.omit(data.frame(levs=levs, flevs=flevs, Real.Margin = dat$realmarg, Incumbent.Running = as.factor(dat$incRun)))
p <- ggplot(df, aes(as.factor(levs), Real.Margin)) +  xlab("Aspirational GDP") + ylab("Incumbent Margin") 
p + geom_boxplot()  +  geom_jitter(width=.3, aes(colour=Incumbent.Running))


# PLOTTING POLLING HOUSE EFFECTS WITH ERROR BARS: Figure 3
df <- plotfunk(sm.mod, variable.name="Pollster", ordering=F)
p <- ggplot(df,aes(factor(lev.names), Intercepts)) + theme(legend.position="none") + geom_smooth() +xlab("Polling House Effects") + theme(axis.text.x=element_blank()) 
p <- p + geom_hline(yintercept=0) +geom_errorbar(aes(ymin=Intercepts-sd.interc, ymax=Intercepts+sd.interc), col="dark grey", width=0) + geom_point(size=0) 
p + annotate("text", x=200, y=-20, label="Pro-Opposition Pollsters") + annotate("text", x=600, y=20, label="Pro-Incumbent Pollsters") 

#Table of included variables:
library(stargazer)
stargazer(dat[, c("pollmarg", "sm.pollmarg", "incRun", "pro.incumbent", "lgdp", "asp.gdp")], type="html", out="~/Downloads/step1data.html")


###################################******************************************************
########### SUPPORTING INFORMATION FILES
#################################*********************************************

# Figure S6
#Histograms and figures for SI:
p1 <- qplot(pollmarg, data=dat, bins=200) + xlab("Polling Margin") + scale_x_continuous(limits=c(-50, 60))
p2 <- qplot(realmarg, data=sm.agg, bins=50) + xlab("Real Margin") + scale_x_continuous(limits=c(-50, 60))
#multiplot(p1, p2)
p1
p2

#Figure S7
p <- qplot(x=pollmarg, y=realmarg, col=region, data=dat, alpha=.1, size=1.5) + xlab("Polling Margin of Incumbent") + ylab("Real Margin of Incumbent")
p


# THE NEXT PLOT - SCATTER OF VARIABLES - FIGURE S8
library(GGally)
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
ggpairs(sm.agg[, c(2,13,14)]) # plot matrix of predictors in second stage PLS model

# Table of variables - Table S8. 
library(stargazer)
stargazer(dat[, c("pollmarg", "sm.pollmarg", "incRun", "pro.incumbent", "asp.gdp")], type="html", out="~/Downloads/step1data.html")
stargazer(sm.agg[, c('realmarg', 'pollmarg', 'sm.pollmarg', 'l1polity2', 'wb.inflation')], type="html", out="~/Downloads/step2data.html")

# Least Squares in the Second Stage:
sm.agg = sm.agg[order(sm.agg$Round.Date), ]
trmod.insample <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation, data=sm.agg[1:50,], trControl= trainControl(method="none"), metric="RMSE", method="lm") 
sum( predict(trmod.insample, newdata=sm.agg[1:50, ])*sm.agg$realmarg[1:50] > 0 )/50 # the in-sample rate
trmodlm <- run.finalsynthetic(meth='lm', grid=NULL)
sum( (trmodlm$pred$pred* trmodlm$pred$obs) > 0) / length(trmodlm$pred$pred) #the out-of-sample rate
postResample(trmodlm$pred$pred, trmodlm$pred$obs)  #the out-of-sample RMSE
stargazer(trmodlm$finalModel, type="html", out="~/Downloads/trmodlm.html")
# Table S10
stargazer(trmodlm$finalModel)
# Figure S9
trmodlm$pred$result <- ifelse((trmodlm$pred$pred* trmodlm$pred$obs) > 0, "correct", "incorrect")
qplot(pred, obs, data=trmodlm$pred, colour=result) + geom_smooth(method="loess", span=1) + ylab("Observed Margin of Incumbent") + xlab("Linear Model Prediction - Out of Sample")


# Complete Pooling in the first stage: Figure S10, S11
# ***See supplemental_complete_pooling_analysis.R

# No Pooling in the first stage: This section has been updated and added to the SI
regfunk <- function(x){
  #create prediction object
  x$sm.pollmarg <- NA
  x <- x[order(x$Round.Date), ]
  
  for(i in 1:length(unique(x$electionid))){
    ###############
    curr.rows <- x$electionid %in% unique(x$electionid)[i] #get the current electionid
    curr.year <- unique(x$year[curr.rows]) #get corresponding year(s)
    all.curr.rows <- which(x$year <= max(curr.year, na.rm=T)) #get rows corresponding to that election and all prior elections
    ################
    #make prediction
    if( dim(na.omit(x[ all.curr.rows , c("pollmarg", "pro.incumbent", "incRun", "asp.gdp")]))[1] >= 1 ){ #if the current number of rows is greater than the degrees of freedom
      mod <- lm(pollmarg~pro.incumbent + incRun + asp.gdp , data= x[ all.curr.rows , ])
      x$sm.pollmarg[curr.rows] <- predict(mod, newdata=x[curr.rows, ]) #predict only current election
    }
    else {
      print(paste("using alt. model in election", unique(x$electionid)[i]))
      x$sm.pollmarg[curr.rows] <- mean(x$pollmarg[all.curr.rows], na.rm=T)
    }
    
  }
  return(x)
}
cc <- as.factor(dat$ccode)
sdat <- split(dat, cc)
sdat <- lapply(sdat, regfunk) # this will produce warnings, because some elections w/ very few polls
nopool <- do.call(rbind, sdat)
sm.agg <- aggregate(cbind(sm.pollmarg, pollmarg, realmarg, year, l1polity2, wb.inflation) ~electionid, median, data=nopool[!is.na(nopool$Candidate.1.Incumbent.Poll.Vote.Share),])
sm.agg <- sm.agg[sm.agg$electionid %in% elections.training$electionid, ] #get matching sample to partial pooling
sm.agg <- sm.agg[order(sm.agg$year), ]
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
trmod.nopool <- train(realmarg~sm.pollmarg+wb.inflation+l1polity2, data=sm.agg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
mod <- trmod.nopool

#Figure S12
sum( mod$pred$pred*mod$pred$obs > 0 )/nrow(mod$pred) # the out of sample accuracy rate 
postResample(mod$pred$pred, mod$pred$obs)  #the out-of-sample RMSE
mod$pred$result <- ifelse((mod$pred$pred* mod$pred$obs) > 0, "correct", "incorrect")
qplot(pred, obs, data=mod$pred, colour=result) + geom_smooth(method="loess") + ylab("Observed Margin of Incumbent") + xlab("No Pooling Model Prediction")

# Multiparty elections Figure S13
trmode$pred$Election_Type <- ifelse(trmode$trainingData$multiparty[51:146] > 0, "Multiparty", "Two-Party")
qplot(pred, obs, data=trmode$pred, colour=Election_Type) + geom_smooth(method="loess") + ylab("Observed Margin of Incumbent") + xlab("Model Prediction")
# Table of results for multiparty elections
mod <- lm(realmarg~sm.pollmarg+wb.inflation+multiparty+l1polity2, data=sm.agg)

# Lead Time: Figure S14
# *** SEE supplemental_iterated_lead_tests.R

# FIGURE S14::::
#show relationship between temporally-normalized growth and Margin of Incumbent
p <- qplot(asp.gdp, realmarg, data=dat) + geom_smooth() + xlab("Aspirational GDP") + ylab("Real Margin of Incumbent")


# Economic Variables: ###############################
# GDP
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, lgdp, realmarg, year, region=as.numeric(as.factor(region)), incRun, multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
p1 <- qplot(lgdp, realmarg, data=dat) + geom_smooth() + xlab("GDP (logged)") + ylab("Real Margin of Incumbent")

# GDP growth
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, gdp.growth, realmarg, year, region=as.numeric(as.factor(region)), incRun, multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
p2 <- qplot(gdp.growth, realmarg, data=dat) + geom_smooth() + xlab("GDP Growth") + ylab("Real Margin of Incumbent")

# Unemployment
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, total_unemployment,  realmarg, year, region=as.numeric(as.factor(region)), incRun,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
p3 <- qplot(total_unemployment, realmarg, data=dat) + geom_smooth() + xlab("Unemployment Rate") + ylab("Real Margin of Incumbent")

# Inflation from WB
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, total_unemployment,  realmarg, year, region=as.numeric(as.factor(region)), incRun,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
p4 <- qplot(wb.inflation, realmarg, data=dat) + geom_smooth() + xlab("Inflation Rate") + ylab("Real Margin of Incumbent")

# Plotting all the subfigures of figure S14
p1
p2
p3
p4

# Incumbent Fatigue: Figure S17
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=pdat1)
sm.agg = sm.agg[order(sm.agg$Round.Date), ]
bts <- read.csv("incumbent_fatigue.csv")
bts <- merge(bts, sm.agg, by="electionid", all.y=T)
bts <- bts[order(bts$Round.Date), ]
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
#linear least squares - no effect for incumbent fatigue, no improvement in predictive power
trmod.fatigue <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation+spell, data=na.omit(bts), trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
sum( trmod.fatigue$pred$pred*trmod.fatigue$pred$obs > 0 )/nrow(trmod.fatigue$pred) # the overall rate
postResample(trmod.fatigue$pred$pred, trmod.fatigue$pred$obs)
p3 <- qplot(spell, realmarg, data=bts) + geom_smooth() + xlab("Period of Incumbent Party in Power") + ylab("Real Margin of Incumbent")

# NOT FIGURES IN THE SI
# Prior Margin
sm.agg <- aggregate(cbind(sm.pollmarg, ccode, realmarg, year, l1polity2, wb.inflation)~electionid, median, data=pdat1)
sm.agg <- sm.agg[order(sm.agg$year), ]
spdat <- split(sm.agg, sm.agg$ccode)
lagfunk <- function(x){
  # Simple function to retrieve lagged values of realmarg
  if(nrow(x)>1){
    x$l.realmarg <- c(0, x$realmarg[-nrow(x)]) #Produce zero if no lag
  } else {
    x$l.realmarg <- 0
  }
  return(x)
} #get lagged margin
spdat <- lapply(spdat, lagfunk) # Split, do the lag function
pmarg <- do.call(rbind, spdat) #bind all the split data back together
pmarg <- pmarg[order(pmarg$year), ]
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
#partial least squares - achieves similar accuracy
trmod.priormarg <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation+l.realmarg, data=pmarg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
mod <- trmod.priormarg
sum( mod$pred$pred*mod$pred$obs > 0 )/nrow(mod$pred) # the overall rate
postResample(mod$pred$pred, mod$pred$obs)  #the out-of-sample RMSE


# Unemployment Rate - we lose 20 elections due to poor coverage
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
sm.agg <- aggregate(cbind(sm.pollmarg, Round.Date, total_unemployment, realmarg, year, l1polity2, wb.inflation) ~electionid, median, data=pdat1)
sm.agg = sm.agg[order(sm.agg$Round.Date), ]
trmod.unemp <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation+total_unemployment, data=sm.agg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
mod <- trmod.unemp
sum( mod$pred$pred*mod$pred$obs > 0 )/nrow(mod$pred) # the out of sample accuracy rate
postResample(mod$pred$pred, mod$pred$obs)  #the out-of-sample RMSE

# Other democracy measure - Cheibub, Gandhi and Vreeland
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
sm.agg <- aggregate(cbind(sm.pollmarg, democ_cgv, Round.Date, realmarg, year, l1polity2, wb.inflation)~electionid, median, data=pdat1)
sm.agg = sm.agg[order(sm.agg$Round.Date), ]
trmod.cgv <- train(realmarg~sm.pollmarg+wb.inflation+democ_cgv, data=sm.agg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
mod <- trmod.cgv
sum( mod$pred$pred*mod$pred$obs > 0 )/nrow(mod$pred) # the out of sample accuracy rate
postResample(mod$pred$pred, mod$pred$obs)  #the out-of-sample RMSE

# Other democracy measure - Freedom House
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
sm.agg <- aggregate(cbind(sm.pollmarg, democ_fh, realmarg, year, l1polity2, wb.inflation) ~electionid, median, data=pdat1)
sm.agg = sm.agg[order(sm.agg$Round.Date), ]
trmod.fh <- train(realmarg~sm.pollmarg+wb.inflation+democ_fh, data=sm.agg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 
mod <- trmod.fh
sum( mod$pred$pred*mod$pred$obs > 0 )/nrow(mod$pred) # the out of sample accuracy rate
postResample(mod$pred$pred, mod$pred$obs)  #the out-of-sample RMSE
#p3 <- qplot(mod$pred$pred, mod$pred$obs, data=mod$pred) + geom_smooth() + xlab("Period of Incumbent Party in Power") + ylab("Real Margin of Incumbent")


################
# NELDA FIGURE S18
###############

#** Disputes with NELDA: Plotting the median poll
sm.agg <- aggregate(cbind(multiparty, realmarg, pollmarg, incApp, pollExist, incLose, year, l1polity2, wb.inflation) ~electionid, median, data=dat)
p1 <- ggplot(sm.agg, aes(x=factor(incApp), y=pollmarg)) + geom_boxplot() + xlab("Incumbent Party Ahead in Reliable Polls (NELDA)") + ylab("Calculated Point Margin of Incumbent Party in Polls")
table(sm.agg$pollmarg>0, sm.agg$incApp)

#** Disputes with NELDA: Plotting the outcome of the election for the incumbent party
p <- ggplot(sm.agg, aes(x=factor(incLose), y=realmarg)) + geom_boxplot() #+ geom_point(col=sm.agg$incLose+2, size=sqrt(sm.agg$l1polity2+10) )
p2 <-  p + xlab("Incumbent Party Candidate Lost Election (NELDA)") + ylab("Real Margin of Incumbent Candidate")
mod <- glm(incLose~realmarg, data=sm.agg)
table(sm.agg$realmarg<0, sm.agg$incLose)

#Plot subfigures for S19:
p1
p2

### Re-running the model without the NELDA disputes:
sm.agg <- aggregate(cbind(Round.Date, realmarg, sm.pollmarg, incApp, pollExist, incLose, year, l1polity2, wb.inflation) ~electionid, median, data=pdat1)
sm.agg$agree <- sm.agg$realmarg>0 & sm.agg$incLose==0 | sm.agg$realmarg<0 & sm.agg$incLose==1
sm.agg <- sm.agg[order(sm.agg$Round.Date), ]
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
#linear partial least squares
trmod.nelda <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation, data=sm.agg[sm.agg$agree, ], trControl=trcontrol, metric="RMSE", method='pls', tuneGrid=expand.grid(ncomp=2)) 
sum( trmod.nelda$pred$pred*trmod.nelda$pred$obs > 0 )/nrow(trmod.nelda$pred) # the out of sample accuracy rate
postResample(trmod.nelda$pred$pred, trmod.nelda$pred$obs)

######################
# CHANGE MODEL ORDER - of polling results - Figure S19
##################
sm.agg <- aggregate(cbind(Round.Date, pollmarg, ccode, realmarg, year, l1polity2, wb.inflation)~electionid, median, data=dat)
sm.agg <- sm.agg[order(sm.agg$year), ]
spdat <- split(sm.agg, sm.agg$ccode)
lagfunk <- function(x){
  # Simple function to retrieve lagged values of realmarg
  if(nrow(x)>1){
    x$l.realmarg <- c(0, x$realmarg[-nrow(x)]) #Produce zero if no lag
  } else {
    x$l.realmarg <- 0
  }
  return(x)
} #get lagged margin
spdat <- lapply(spdat, lagfunk) # Split, do the lag function
pmarg <- do.call(rbind, spdat) #bind all the split data back together
pmarg <- pmarg[order(pmarg$Round.Date), ]
trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut

# get elections that are trained and tested
elections.training <- read.csv("elections_training_wround09082016.csv")

# PREDICT THE NATIONAL-LEVEL VOTE SHARE BASED ON THE GROWTH AND TIME INCUMBENT HAS BEEN IN POWER
training_mod = lm(realmarg~l.realmarg+l1polity2+wb.inflation, data=pmarg[1:50,])
pmarg$timeforchangepred = NA
pmarg$timeforchangepred[1:50] = predict(training_mod, newdata=pmarg[1:50, ])
trmod.hist <- train(realmarg~l.realmarg+l1polity2+wb.inflation, data=pmarg, trControl=trcontrol, tuneGrid=expand.grid(ncomp=2), metric="RMSE", method="pls") 

# Accuracy of the historical model 
sum( trmod.hist$pred$pred*trmod.hist$pred$obs >= 0 )/nrow(trmod.hist$pred) # the overall rate
postResample(trmod.hist$pred$pred, trmod.hist$pred$obs)
# Put out of sample predictions into the data
pmarg$timeforchangepred[trmod.hist$pred$rowIndex] = trmod.hist$pred$pred
# creat minimal data set
tf = data.frame(electionid = pmarg$electionid, timeforchangepred = pmarg$timeforchangepred)

# USE THE NATIONAL-LEVEL VOTE SHARE PREDICION AS A FEATURE
dat = merge(dat, tf, by="electionid")
get_preds <- function(pdat=dat, trainingset=elections.training, trainingsize= 50 , leadTime = 0){
  pdat$wts <- scale(1/(pdat$days.to.elec-leadTime+2), center=F) #
  
  # Remove missing polls and dates
  pdat <- pdat[!is.na(pdat$pollmarg), ]
  pdat <- pdat[!is.na(pdat$Poll.Date), ]
  
  for(i in 1:sum(elections.training$trainingset==0)){
    
    elecs <- pdat$electionid %in% elections.training$electionid[ 1:(trainingsize+i)] #get T/F in or out of training set
    
    #########
    sm.mod <- try(lmer(pollmarg ~ timeforchangepred + pro.incumbent + (pro.incumbent|incRun) + (1|asp.gdp.bin) +(1|electionid)+(1|ccode)+(1|Pollster)+(1|region), na.action=na.exclude, weights=wts, data=pdat[ elecs, ], control=lmerControl(optimizer="bobyqa", boundary.tol = 1e-2, check.scaleX="silent.rescale")) )
    #error handling
    w <- warnings(); assign("last.warning", NULL, envir = baseenv()) #assign warning to w, then delete all warnings
    
    #######
    if(i==1){ #if it is the first round of the loop, predict the training set plus the first test election
      pdat$sm.pollmarg[ elecs] <- predict(sm.mod, newdata=pdat[ elecs,])    
    }
    #######
    if(i>1 ){ #if it is not the first round of the loop, predict just the next test election
      next.elec <- pdat$electionid %in% elections.training$electionid[ trainingsize + i ] #get just the election to make smoothed pred.
      pdat$sm.pollmarg[ next.elec ] <- predict(sm.mod, newdata=pdat[next.elec, ]) #produces a prediction
      print(elections.training$electionid[ trainingsize + i ])
    }
    print(paste(i, "successfully completed"))
  }
  return(pdat)
}

final_df = get_preds(dat, trainingset = elections.training, trainingsize = 50, leadTime=0)

sm.agg <- aggregate(cbind(timeforchangepred, pollmarg, realmarg, year, gdp.growth, sm.pollmarg, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=final_df)
# prediction based on the model, no calibration
sum( sm.agg$sm.pollmarg*sm.agg$realmarg >= 0 )/nrow(sm.agg) # the overall rate
postResample( sm.agg$sm.pollmarg, sm.agg$realmarg ) # the overall rate

sm.agg$result <- ifelse((sm.agg$sm.pollmarg* sm.agg$realmarg) > 0, "correct", "incorrect")
qplot( sm.pollmarg, realmarg, data=sm.agg, colour=result) +  geom_smooth() + xlab("Prediction") + ylab("Real Margin of Incumbent")

## FIGURE S20: ** SEE APSA_prediction.R

###########
###### *****  END
